To-do:
Let's start by importing all the libraries.
# our vast library, avoids having too many lines of code in this notebook
import vast2 as vst
# libraries for visualization
from ipywidgets import IntProgress
from IPython.display import display
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import heapq
import os
import re
# libraries for audio
from scipy import signal
from scipy.io import wavfile
from pydub import AudioSegment
# for parallelizing
from multiprocessing import Pool
import multiprocessing
from tqdm import tqdm_notebook as tqdm
# widgets
import ipywidgets as widgets
# Base URL of the VAST 2018 Mini-Challenge 1 data hosted on GitHub.
path = "https://github.com/zhufangda/Telecom_Paristech-3rd_year/raw/master/"\
+"DATA920_Visualization/2018%20Mini-Challenge%201/"
# Load the useful data with our vast helper: the full records table, the
# Kasios test records, the number of species, the species categorical and
# the index of the Rose-crested Blue Pipit.
data, kasios_records, nb_categories, \
cat_bird, i_bp = vst.load_data(path, "AllBirdsv4.csv",
"Test_Birds_Location.csv")
# Sentinel for missing years: one less than the earliest real year, so the
# "unknown" bucket sorts first in the distributions below.
data['Year'] = data['Year'].fillna(data['Year'].min()-1)
# Distinct sorted years / quality grades / vocalization types feeding the widgets.
dates = list(np.int_(data['Year'].copy().drop_duplicates().sort_values()))
qualities = data['Quality'].drop_duplicates()
categories_audio = data['Vocalization_type'].drop_duplicates()
# Vectorized park-map background used by every map plot below.
map_contour = vst.vectorize(path, "lekagul_roadways_2018.png")
def update_plot(date, species, quality, field, vocal_type):
    """Redraw the record-count bar chart for the current widget selection.

    date       -- (start, end) year pair from the range slider.
    species    -- iterable of selected English species names.
    quality    -- iterable of selected quality grades.
    field      -- 'Year' or 'English_name': which distribution to draw.
    vocal_type -- iterable of selected vocalization types.
    Reads the module-level `data` frame and draws with matplotlib.
    """
    fig, ax = plt.subplots(figsize=(15,6))
    # NOTE(review): Series.between is inclusive, so date[1]+1 also keeps
    # records from the year after the selected upper bound -- confirm intended.
    data_to_plot = data[data['Year'].between(date[0], date[1]+1)]
    data_to_plot = data_to_plot.loc[data_to_plot['English_name'].isin(species)]
    data_to_plot = data_to_plot.loc[data_to_plot['Quality'].isin(quality)]
    data_to_plot = data_to_plot.loc[data_to_plot['Vocalization_type'].isin(vocal_type)]
    if not data_to_plot.empty:
        if field == 'Year':
            # Stacked bars per year: Blue Pipits vs all other birds.
            data_to_plot.groupby(['Year','IsRCBP'])['X'].size().unstack()\
                .plot(kind='bar',stacked=True, ax=ax,
                      color=['#009432','#0652DD'], zorder=2)
            ax.legend(["Other birds", "Blue Pipits"])
            plt.ylabel("# of records", fontsize=12)
            plt.xlabel('Year', fontsize=12)
            plt.xticks(rotation=45, ha='right')
            ax.grid(which='major', axis='y', linestyle='--', zorder=1)
            # Relabel the missing-year sentinel as 'NaN'.
            # NOTE(review): 1982 is presumably min(Year)-1 for this dataset
            # (the fill value set above) -- confirm against the data.
            labels = [int(float(item.get_text())) for item in ax.get_xticklabels()]
            if 1982 in labels:
                labels[labels.index(1982)] = 'NaN'
            ax.set_xticklabels(labels)
        if field == 'English_name':
            # One bar per species; the Blue Pipit bar is recolored blue.
            data_cat = pd.Categorical(data_to_plot['English_name'],ordered=False)
            ax = data_cat.value_counts().plot(kind='bar', color='#009432',
                                              label='Other birds', zorder = 2)
            if 'Rose-crested Blue Pipit' in species:
                # NOTE(review): assumes bar order matches sorted(species) --
                # verify value_counts ordering keeps this index valid.
                indice_bp = sorted(species).index('Rose-crested Blue Pipit')
                ax.get_children()[indice_bp].set_color('#0652DD')
                ax.get_children()[indice_bp].set_label('Blue Pipits')
            plt.xticks(rotation=30, ha='right')
            plt.ylabel("# of records", fontsize=12)
            ax.legend()
            ax.grid(which='major', axis='y', linestyle='--', zorder=1)
    ax.set_title("Number of records: {}".format(len(data_to_plot)), fontsize=16)
def update_map(date, species, quality, vocal_type):
    """Redraw the park map, scattering every selected record at its (X, Y).

    Same filter arguments as update_plot (minus `field`). Each species gets a
    fixed palette color indexed by its position in cat_bird.categories.
    Reads the module-level data, map_contour and cat_bird.
    """
    colors = ['#009432','#a6cee3','#b2df8a','#fb9a99','#e31a1c','#fdbf6f',
              '#ff7f00','#33a02c','#cab2d6','#6a3d9a','#ffff99','#b15928','#004d40','#7B1FA2',
              '#7C4DFF','#795548','#0652DD','#B53471','#FF9800','#8BC34A','#CDDC39',
              '#b71c1c','#FFC107','#607D8B']
    fig, ax = plt.subplots(figsize=(10,10))
    # NOTE(review): as in update_plot, between(date[0], date[1]+1) also keeps
    # the year after the selected upper bound -- confirm intended.
    data_to_plot = data[data['Year'].between(date[0], date[1]+1)]
    data_to_plot = data_to_plot.loc[data_to_plot['English_name'].isin(species)]
    data_to_plot = data_to_plot.loc[data_to_plot['Quality'].isin(quality)]
    data_to_plot = data_to_plot.loc[data_to_plot['Vocalization_type'].isin(vocal_type)]
    vst.print_map(ax, map_contour, "Number of records: {}".format(len(data_to_plot)))
    if not data_to_plot.empty:
        for i, categ in enumerate(species):
            # NOTE(review): the boolean masks below are built on the full
            # `data` frame, not data_to_plot; this relies on pandas aligning
            # a larger boolean index and may raise on recent pandas -- verify.
            X = data_to_plot.loc[data['English_name'] == categ]['X'].tolist()
            Y = data_to_plot.loc[data['English_name'] == categ]['Y'].tolist()
            element = ax.scatter(X,Y, color=colors[cat_bird.categories.tolist().index(categ)], alpha=1, marker='*', zorder=5, label=categ)
        plt.legend(bbox_to_anchor=(1.1, 1.))
# ---- Widgets driving the interactive distribution plot ----
options_dates = [(str(i), i) for i in dates]
widget_date = widgets.SelectionRangeSlider(options=options_dates, index=(0, len(dates)-1),
                                           description='Date', disabled=False,
                                           continuous_update=False)
widget_species = widgets.SelectMultiple(options=cat_bird.categories.tolist(),
                                        value=cat_bird.categories.tolist(), rows=6,
                                        description='Species', disabled=False)
widget_quality = widgets.SelectMultiple(options=qualities.tolist(),
                                        value=qualities.tolist(), rows=6,
                                        description='Quality', disabled=False)
widget_choice = widgets.ToggleButtons(options=['Year', 'English_name'], description='Field X:',
                                      disabled=False, button_style='success',
                                      tooltips=['Plot the distribution over years',
                                                'Plot the distribution over species'])
# (an unassigned widgets.Dropdown(...) sat here; it was never displayed nor
# wired to anything, so it has been removed)
widget_vocaltype = widgets.SelectMultiple(options=categories_audio.tolist(),
                                          value=categories_audio.tolist(), rows=2,
                                          description='Vocal_type', disabled=False)
left_widget = widgets.VBox([widget_date, widget_vocaltype, widget_choice])
all_widgets = widgets.HBox([left_widget, widget_species, widget_quality])
out = widgets.interactive_output(update_plot, {'date': widget_date,
                                               'species': widget_species,
                                               'quality': widget_quality,
                                               'field': widget_choice,
                                               'vocal_type': widget_vocaltype})
# ---- Independent copies of the widgets for the interactive map ----
widget_date2 = widgets.SelectionRangeSlider(options=options_dates, index=(0, len(dates)-1),
                                            description='Date', disabled=False,
                                            continuous_update=False)
widget_species2 = widgets.SelectMultiple(options=cat_bird.categories.tolist(),
                                         value=cat_bird.categories.tolist(), rows=6,
                                         description='Species', disabled=False)
widget_quality2 = widgets.SelectMultiple(options=qualities.tolist(),
                                         value=qualities.tolist(), rows=6,
                                         description='Quality', disabled=False)
widget_vocaltype2 = widgets.SelectMultiple(options=categories_audio.tolist(),
                                           value=categories_audio.tolist(), rows=4,
                                           description='Vocal_type', disabled=False)
left_widget2 = widgets.VBox([widget_date2, widget_vocaltype2,])
all_widgets2 = widgets.HBox([left_widget2, widget_species2, widget_quality2])
# BUG FIX: the map was previously wired to widget_vocaltype (the *plot*'s
# selector), so changing the map's own Vocal_type widget had no effect.
out2 = widgets.interactive_output(update_map, {'date': widget_date2,
                                               'species': widget_species2,
                                               'quality': widget_quality2,
                                               'vocal_type': widget_vocaltype2})
# This plot lets us check whether enough records remain for a given selection
display(all_widgets, out)
# The same, but on a map
display(all_widgets2, out2)
# Plot the map with all records and the Kasios test records
vst.plotmap(data, kasios_records, map_contour)
# delete all useless data : keep only good quality, call and songs
data_sounds = data.loc[data['Quality'] <= 'C']
data_sounds = data_sounds.loc[data.Vocalization_type.isin(['song' , 'call','call, song'])]
print("We go from {} files to {} with better quality".format(len(data),len(data_sounds)))
cat_bird2 = pd.Categorical(data_sounds['English_name'],ordered=False)
vst.plot_new_distribub(cat_bird2)
#transform mp3 into wav
sound = AudioSegment.from_mp3("test.mp3")
sound = sound.set_frame_rate(44100)
sound = sound.set_channels(1)
sound.export("test.wav", format="wav")
# open the wav file
rate, samples = wavfile.read("test.wav")
print(rate) ## nb frames per second
print(samples.shape)
n_samples = len(samples)
# if stereo, keep only one channel
if samples.ndim == 2:
samples = samples[:,0]
# get the frequencies, times and the spectrogram using our log_specgram function
freqs, times, spectrogram = vst.log_specgram(samples, rate)
# plot the magnitude and the spectrogram on the scale time scale
vst.plot_magnitude_spectrogram(samples, rate, freqs, times, spectrogram)
# perform the fast fourier transform
fourier, power = vst.custom_fft(samples, rate)
vst.plot_fft(fourier, power)
Following the conclusions of T. Papadopoulos, S.J. Roberts and K. Willis, "Automated bird sound recognition in realistic settings", Sep. 2018:
# Estimate the peak level as the mean of the 1% most energetic samples.
mean_high_NRJ_samples = np.mean(heapq.nlargest(n_samples//100, np.abs(samples)))
# Keep only samples whose amplitude reaches 25% of that estimated peak.
# (Vectorized NumPy mask instead of a per-sample Python comprehension --
# same result, far faster on long recordings.)
samples = samples[np.abs(samples) > 0.25 * mean_high_NRJ_samples]
# update nb frames
n_samples = len(samples)
# Recompute spectrogram and FFT on the trimmed signal and re-plot.
freqs, times, spectrogram = vst.log_specgram(samples, rate)
fourier, power = vst.custom_fft(samples, rate)
vst.plot_magnitude_spectrogram(samples, rate, freqs, times, spectrogram)
vst.plot_fft(fourier, power)
This is already done by the function log_specgram(audio, sample_rate, window_size=20, step_size=10, eps=1e-10).
# log_specgram defaults (documented above): 20 ms windows, 10 ms steps.
window_size=20
step_size=10
# Keep only frequencies between 1 kHz and 10 kHz.
# (Vectorized NumPy mask instead of a Python comprehension over every bin.)
indices = np.where((freqs > 1000) & (freqs < 10000))[0]
freqs_red = freqs[indices]
spectrogram_red = spectrogram[:,indices]
from sklearn.preprocessing import normalize
# L1-normalize each frame so every spectrum sums to 1: each frame becomes a
# probability distribution over frequency, as required by the mean/std/mode
# features computed later.
sequence_spectogram_normalized = normalize(spectrogram_red, norm="l1", axis=1)
vst.plot_magnitude_spectrogram(samples, rate, freqs_red, times, sequence_spectogram_normalized)
# Let's split the spectrogram into sequences.
# NOTE(review): the splitting rule lives in vst.get_sequences -- presumably
# splits on silent gaps; confirm in vast2.
list_sequences = vst.get_sequences(sequence_spectogram_normalized)
# let's get the first sequence.
sequence_spectogram_normalized = list_sequences[0]
vst.plot_spectrogram(freqs_red, times, sequence_spectogram_normalized)
# Create histogram bin edges from e_min to e_max with the given step.
f_mean_edges = vst.edges(1000, 10000, 100)
f_std_edges = vst.edges(1000, 4000, 50)
f_mode_edges = vst.edges(1000, 10000, 100)
f_delta_mode_edges = vst.edges(-2000, 2000, 50)
# Cap the sequence at 100 frames.
n_frames_sequence = min(sequence_spectogram_normalized.shape[0], 100)
# Per-frame spectral features; each L1-normalized frame is treated as a
# probability distribution over frequency.
# f_mean: expected frequency of each frame.
f_mean = [np.sum(freqs_red * sequence_spectogram_normalized[i])
          for i in range(n_frames_sequence)]
# f_std: spectral spread around that mean.
f_std = [(np.abs(np.sum(sequence_spectogram_normalized[i]
                        * ((freqs_red - f_mean[i]) ** 2) ))) ** (0.5)
         for i in range(n_frames_sequence)]
# f_mode: dominant frequency (bin with the most energy).
f_mode = [freqs_red[np.argmax(sequence_spectogram_normalized[i])]
          for i in range(n_frames_sequence)]
# f_delta_mode: frame-to-frame jump of the dominant frequency.
# NOTE(review): np.roll wraps around, so the last entry compares the last
# frame with the first one -- confirm this is intended.
f_delta_mode = np.roll(f_mode, -1) - f_mode
# Bin the features into histograms: two 2-D ones and one 1-D one.
sequence_histogram1, f_mean_edges, f_std_edges = np.histogram2d(f_mean, f_std, bins=(f_mean_edges, f_std_edges))
sequence_histogram2, f_mode_edges = np.histogram(f_mode, bins=f_mode_edges)
sequence_histogram3, f_mode_edges, f_delta_mode_edges = np.histogram2d(f_mode, f_delta_mode, bins=(f_mode_edges, f_delta_mode_edges))
# Flatten the 2-D histograms into feature vectors.
features1 = sequence_histogram1.flatten()
features2 = sequence_histogram2
features3 = sequence_histogram3.flatten()
# Plot the 3 histograms = our features for this sequence.
vst.plot2_features("test sequence", sequence_histogram1, sequence_histogram2,
                   sequence_histogram3, freqs_red)
Transform every mp3 in the In folder that corresponds to a good record into a wav file in the Out folder, at a rate of 44100 Hz.
# transform all mp3 into wav (44.1 kHz mono), with a progress bar
f = IntProgress(min=0, max=len(data), description='Load files:', bar_style='success') # instantiate the bar
print("progress")
display(f) # display the bar
# Hoisted out of the loop: O(1) membership tests on the retained record IDs.
valid_ids = set(data_sounds['File ID'].values)
for file in os.listdir("Sounds/In/"):
    # The record ID is the longest run of digits in the file name.
    # FIX: raw string for the regex ('\d' is an invalid escape in a plain
    # string literal on modern Python).
    record_id = int(max(re.findall(r'\d+', file), key=len))
    # convert only records retained in data_sounds
    if record_id in valid_ids:
        sound = AudioSegment.from_mp3("Sounds/In/"+file)
        sound = sound.set_frame_rate(44100)
        sound = sound.set_channels(1)
        sound.export("Sounds/Out/" + str(record_id) + ".wav", format="wav")
        f.value += 1
print("done !")
# create all spectrograms in parallel, one worker per CPU core
cores = multiprocessing.cpu_count()
pool = Pool(processes=cores)
file_list = list(os.listdir("Sounds/Out/"))
# one list of spectrograms per species
all_spectrograms = [[] for _ in range(nb_categories)]
# NOTE(review): get_spectrogram, get_sequences_per_category and get_features
# are not defined in this file -- presumably other notebook cells or vast2
# helpers; confirm where they come from.
for category, spectrogram, freqs_red in tqdm(pool.imap_unordered(get_spectrogram, file_list), total=len(file_list)):
    all_spectrograms[category].append(spectrogram)
# retrieve sequences from spectrograms
all_sequences = get_sequences_per_category(all_spectrograms)
# create the dataset from the sequences: 1 label column + 10100 feature columns
dataset = pd.DataFrame(columns=range(10101))
n_rows = 0
for category in range(nb_categories):
    for j, sequence in enumerate(all_sequences[category]):
        # NOTE(review): freqs_red here is the leftover loop variable from the
        # imap_unordered loop above -- this assumes every file shares the same
        # frequency grid; confirm.
        dataset.loc[n_rows] = get_features(category, sequence, freqs_red)
        n_rows += 1
# persist the dataset (floats rounded to 3 decimals)
dataset.to_csv("dataset.csv", float_format='%.3f')
dataset2 = dataset.copy()
dataset2 = dataset2.astype(int)
dataset2.to_csv("dataset2.csv")
len(dataset)
Instead of executing the previous commands, just load the saved dataset.
dataset = pd.read_csv("dataset2.csv")
# drop the unnamed index column written by to_csv
dataset = dataset.drop(dataset.columns[0], axis=1)
# mean feature vector per species (column '0' holds the category label)
dataset_category = dataset.groupby(['0']).mean()
# re-insert a dummy label column so the positional slices below line up
dataset_category.insert(0, '0', 0.0)
categories = []
features1 = []
features2 = []
features3 = []
# Split each row back into the three histograms:
# [0:5000] mean/std 2-D, [5000:5100] mode 1-D, [5100:10100] mode/delta 2-D.
# NOTE(review): values[0] is the dummy label inserted above, so [0:5000]
# spans the label plus 4999 bins -- verify against get_features' layout.
for row in dataset_category.iterrows():
    index, data_i = row
    categories.append(index)
    features1.append(np.reshape(data_i.values[0:5000], (50, 100)).T)
    features2.append(data_i.values[5000:5100])
    features3.append(np.reshape(data_i.values[5100:10100], (50, 100)).T)
# centers of the 50 Hz bins between 1 kHz and 10 kHz
freqs = np.array(range(1050, 10000, 50))
for i, H1, H2, H3 in zip(categories, features1, features2, features3):
    vst.plot2_features(cat_bird.categories.tolist()[i], H1, H2, H3, freqs)
# using the ssim (structural similarity, an image-comparison measure)
from skimage.measure import compare_ssim
# NOTE(review): compare_ssim was removed from scikit-image 0.18+; newer
# versions expose skimage.metrics.structural_similarity instead.
# Similarity between two species = mean of the three per-feature ssim values.
ssim_beetween = np.array([[np.mean([compare_ssim(features1[i], features1[j], full=True)[0],
                                    compare_ssim(features2[i], features2[j], full=True)[0],
                                    compare_ssim(features3[i], features3[j], full=True)[0]])
                           for i in range(nb_categories)]
                          for j in range(nb_categories)])
vst.plot_heatmap(ssim_beetween, cat_bird.categories.tolist(),
                 cat_bird.categories.tolist(), "Similarity beetween species",
                 cmap="YlGn", cbarlabel="ssim", txt=True)
import networkx as nx
from networkx.readwrite import json_graph
import json
from IPython.display import display, HTML, Javascript
# node id -> species name, for the graph annotations below
labels = {}
for i, cat in enumerate(cat_bird.categories.tolist()):
    labels[i]=cat
def create_graph(similarity, threshold):
    """Build an undirected species-similarity graph.

    One node per species index; an edge links two species whose similarity
    exceeds ``threshold``, weighted by how far above the threshold the
    similarity is (scaled by 100 for the JS rendering).

    similarity -- 2-D square array of pairwise similarities.
    threshold  -- minimum similarity required to draw an edge.
    Returns a networkx.Graph.
    """
    n = similarity.shape[0]
    G = nx.Graph()
    # idiom: add_nodes_from accepts any iterable -- no list needed
    G.add_nodes_from(range(n))
    for i in range(n):
        for j in range(i+1, n):
            if similarity[i,j]> threshold:
                G.add_edge(i, j, weight=( similarity[i,j] - threshold) * 100 )
    return G
G = create_graph(ssim_beetween, 0.75)
# Annotate nodes for the interactive JS rendering.
# NOTE(review): G.node was removed in networkx 2.4 (use G.nodes instead) --
# this code assumes an older networkx version.
for ix in G.nodes():
    G.node[ix]['category'] = labels[ix]
    G.node[ix]['isKasios'] = 0
for ix,deg in G.degree():
    G.node[ix]['degree'] = deg
    G.node[ix]['parity'] = (1-deg%2)
    #G.node[ix]['katz'] = 0.1
for ix,katz in nx.katz_centrality(G).items():
    G.node[ix]['katz'] = katz
# serialize the graph as node-link JSON for the JS visualization
datajson = json_graph.node_link_data(G)
with open('graphsim.json', 'w') as f:
    json.dump(datajson, f, indent=4)
# interactive graph; the edge width depends on the similarity between species
display(HTML(vst.js_getResults))
# Convert the 15 Kasios test mp3s to 44.1 kHz mono wav, with a progress bar.
f = IntProgress(min=0, max=15, description='Load files:', bar_style='success') # instantiate the bar
display(f) # display the bar
for file in os.listdir("Sounds_Kasios/In/"):
    # record ID = longest digit run in the file name
    record_id = int(max(re.findall('\d+', file), key=len))
    # unlike the training set above, every Kasios record is converted
    # (no quality filter)
    sound = AudioSegment.from_mp3("Sounds_Kasios/In/"+file)
    sound = sound.set_frame_rate(44100)
    sound = sound.set_channels(1)
    sound.export("Sounds_Kasios/Out/" + str(record_id) + ".wav", format="wav")
    f.value += 1
# number of converted records = final progress-bar value
nb_kasios = f.value
list_kasios_sequences = [[] for i in range(nb_kasios)]
for i in range(nb_kasios):
    # NOTE(review): get_spectrogram_kasios and get_sequences are not defined
    # in this file -- presumably other notebook cells or vast2 helpers.
    t_spectogram_normalized, t_freqs_red = get_spectrogram_kasios(i+1)
    list_kasios_sequences[i].extend(get_sequences(t_spectogram_normalized))
# Create the Kasios dataset from the sequences (same 10101-column layout:
# 1 record-id column + 10100 histogram features).
dataset_kasios = pd.DataFrame(columns=range(10101))
n_rows = 0
for id in range(nb_kasios):
    for _, sequence in enumerate(list_kasios_sequences[id]):
        # NOTE(review): t_freqs_red is the leftover variable from the last
        # loop iteration above -- assumes all records share one frequency grid.
        dataset_kasios.loc[n_rows] = get_features(id, sequence, t_freqs_red)
        n_rows += 1
dataset_kasios.to_csv("dataset_kasios.csv", float_format='%.3f')
# reload and drop the index column written by to_csv
dataset_kasios = pd.read_csv("dataset_kasios.csv")
dataset_kasios = dataset_kasios.drop(dataset_kasios.columns[0], axis=1)
# mean feature vector per Kasios record (column '0' holds the record id)
kasios_mean = dataset_kasios.groupby(['0']).mean()
kasios_mean.insert(0, '0', 0.0)
l_kasios_id = []
features1k = []
features2k = []
features3k = []
# split each row back into the three histograms (same slices as the species)
for row in kasios_mean.iterrows():
    index, data_i = row
    l_kasios_id.append(index)
    features1k.append(np.reshape(data_i.values[0:5000], (50, 100)).T)
    features2k.append(data_i.values[5000:5100])
    features3k.append(np.reshape(data_i.values[5100:10100], (50, 100)).T)
# centers of the 50 Hz bins between 1 kHz and 10 kHz
freqs = np.array(range(1050, 10000, 50))
for i, H1, H2, H3 in zip(l_kasios_id, features1k, features2k, features3k):
    vst.plot2_features("Kasios record #" + str(int(i+1)), H1, H2, H3, freqs)
nb_kasios = len(l_kasios_id)
kasios_label = ["Kasios record #" + str(int(i+1)) for i in range(nb_kasios)]
# ssim between each Kasios record and each species (mean of the three ssims)
ssim_k = np.array([[np.mean([compare_ssim(features1[i], features1k[j], full=True)[0],
                             compare_ssim(features2[i], features2k[j], full=True)[0],
                             compare_ssim(features3[i], features3k[j], full=True)[0]])
                    for i in range(nb_categories)]
                   for j in range(nb_kasios)])
vst.plot_heatmap(ssim_k, kasios_label, cat_bird.categories.tolist(),
                 "Similarity beetween Kasios birds and species",
                 cmap="YlGn", cbarlabel="ssim")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
# dataset with all 10100 features
dataset3 = dataset.copy()
# y: species label column; X: the 10100 histogram features
y = dataset3['0']
X = dataset3.drop(dataset3.columns[0], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# standardization of all features (fitted on the train split only,
# avoiding train/test leakage)
scaler1 = StandardScaler()
scaler1.fit(X_train)
X_train = scaler1.transform(X_train)
X_test = scaler1.transform(X_test)
# apply the same scaling to the Kasios records
kasios_id = dataset_kasios['0']
X_kasios = dataset_kasios.drop(dataset_kasios.columns[0], axis=1)
X_kasios = scaler1.transform(X_kasios)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
# one-vs-rest random forest, 100 trees, fixed seed for reproducibility
random_forest = OneVsRestClassifier(RandomForestClassifier(n_estimators=100, random_state=1))
random_forest.fit(X_train, y_train)
random_forest.score(X_test, y_test)
import scikitplot as skplt
skplt.metrics.plot_confusion_matrix(y_test, random_forest.predict(X_test), normalize=True, figsize=(16,9))
# per-sequence class probabilities for all Kasios sequences
probas = random_forest.predict_proba(X_kasios)
# transform into an array by kasios record // aggregate by mean
# (sequences of record j are contiguous in probas; value_counts()[j] looks
# up the sequence count for label j)
# NOTE(review): kasios_id.value_counts() is recomputed on every iteration;
# could be hoisted out of the loop.
i = 0
probas_kasios = []
for j in range(nb_kasios):
    probas_kasios.append(np.mean(np.array([probas[i+k] for k in range(kasios_id.value_counts()[j])]), axis=0))
    i+=kasios_id.value_counts()[j]
probas_kasios = np.array(probas_kasios)
vst.plot_heatmap(probas_kasios,kasios_label, cat_bird.categories.tolist(),
                 "probability Kasios = specy, 10100 features",
                 cmap="YlGn", cbarlabel="probability")
# Build, for every Kasios record, a tooltip string listing its five most
# likely species with their rounded probabilities -- one
# "P<rank>: <probability> - <species>" line per entry.
probas_kasios_labels = []
for k in range(nb_kasios):
    ranked = zip(np.round(probas_kasios[k,:], 2), cat_bird.categories.tolist())
    top5 = heapq.nlargest(5, ranked)
    lines = ["P" + str(rank+1) + ": " + str(proba) + " - " + species + "\n"
             for rank, (proba, species) in enumerate(top5)]
    probas_kasios_labels.append("".join(lines))
def create_graph2(similarity, probas, threshold1=0.75, threshold_probas=0.1):
    """Build a graph mixing species nodes and Kasios-record nodes.

    Nodes 0..n-1 are species, nodes n..n+p-1 are Kasios records. Two edge
    kinds: species-species edges (stype 0) where similarity exceeds
    ``threshold1``, and Kasios-species edges (stype 1) where the predicted
    probability exceeds ``threshold_probas``; weights scale with the margin
    above the respective threshold for the JS rendering.

    similarity -- (n, n) species similarity matrix.
    probas     -- (p, n) per-record class probabilities.
    Returns a networkx.Graph.
    """
    (p, n) = probas.shape
    G = nx.Graph()
    # idiom: one add_nodes_from call instead of an add_node loop
    G.add_nodes_from(range(n + p))
    for i in range(n):
        for j in range(i+1, n):
            if (similarity[i,j] > threshold1):
                G.add_edge(i, j, weight= (similarity[i,j] - threshold1) * 100, stype = 0)
    for j in range(p):
        for i in range(n):
            if (probas[j, i] > threshold_probas):
                G.add_edge(n + j, i, weight = 2 * np.sqrt((probas[j, i] - threshold_probas) * 100), stype = 1)
    return G
G2 = create_graph2(ssim_beetween, probas_kasios)
# Annotate nodes: the first nb_categories ids are species, the rest Kasios.
# NOTE(review): G2.node was removed in networkx 2.4 (use G2.nodes) -- this
# code assumes an older networkx version.
for ix in G2.nodes():
    if ix < nb_categories:
        G2.node[ix]['category'] = cat_bird.categories.tolist()[ix]
        G2.node[ix]['tip'] = cat_bird.categories.tolist()[ix]
        G2.node[ix]['isKasios'] = 0
    else:
        # Kasios node: short label K<i> plus the top-5 probability tooltip
        G2.node[ix]['category'] = 'K' + str(ix - nb_categories + 1)
        G2.node[ix]['tip'] = probas_kasios_labels[ix - nb_categories]
        G2.node[ix]['isKasios'] = 1
for ix,deg in G2.degree():
    G2.node[ix]['degree'] = deg
    G2.node[ix]['parity'] = (1-deg%2)
    #G.node[ix]['katz'] = 0.1
for ix,katz in nx.katz_centrality(G2).items():
    G2.node[ix]['katz'] = katz
# serialize as node-link JSON for the JS visualization
datajson2 = json_graph.node_link_data(G2)
with open('graphsim2.json', 'w') as f:
    json.dump(datajson2, f, indent=4)
display(HTML(vst.js_getResults2))
# Kasios records with >10% probability of being a Blue Pipit.
# NOTE(review): column 16 is assumed to be the Blue Pipit -- elsewhere the
# code uses i_bp for that index; confirm they agree.
list_kasios_bp = [i for i in range(nb_kasios) if probas_kasios[i,16]>0.1]
print("--------------3 MOST PROBABLE SPECIES BY KASIOS RECORD------------------------")
print("******************************************************************************")
for j in range(nb_kasios):
    bestprobas = heapq.nlargest(3, zip(probas_kasios[j,:], cat_bird.categories.tolist()))
    print("Kasios record #", j+1, " : ")
    for i, bestproba in enumerate(bestprobas):
        print(" {} : {} , proba {}".format(i+1, bestproba[1], bestproba[0]))
from scipy.stats import mode
# Majority vote: most frequent per-sequence prediction for each record.
predict = random_forest.predict(X_kasios)
i = 0
predict_kasios = []
for j in range(nb_kasios):
    predict_kasios.append(mode(np.array([predict[i+k] for k in range(kasios_id.value_counts()[j])]), axis=None))
    i+=kasios_id.value_counts()[j]
# mode() returns (values, counts); keep the winning label as an int
predict_kasios = np.array(predict_kasios).T[0][0].astype(int)
def plot_comparaison_predict(prediction):
    """Plot one 9-panel comparison row per Kasios record.

    Columns 0-2 show the Blue Pipit reference features, columns 3-5 the
    Kasios record's own features, columns 6-8 the features of the species
    predicted for that record.

    prediction -- array of predicted category indices, one per Kasios record.
    Uses the module-level l_kasios_id, features1/2/3, features1k/2k/3k,
    freqs, list_kasios_bp, i_bp and cat_bird.
    """
    fig, axs = plt.subplots(len(l_kasios_id), 9, figsize=(18,40))
    plt.subplots_adjust(hspace = 0.5)
    for i, H1, H2, H3 in zip(l_kasios_id, features1k, features2k, features3k):
        title_bp = ""
        if i in list_kasios_bp:
            title_bp = ", may be a Blue pipit "
        # hide the raw axes; vst.plot3_features draws into them
        for j in range(9):
            axs[int(i),j].axis('off')
        vst.plot3_features(cat_bird.categories.tolist()[i_bp], features1[i_bp],
                           features2[i_bp], features3[i_bp],
                           freqs, axs[int(i),0:3])
        vst.plot3_features("K#" + str(int(i + 1)) + title_bp, H1, H2, H3,
                           freqs, axs[int(i),3:6])
        vst.plot3_features("predicted : K#" + str(int(i + 1)) + "=" +\
                           cat_bird.categories.tolist()[prediction[int(i)]],
                           features1[prediction[int(i)]],
                           features2[prediction[int(i)]],
                           features3[prediction[int(i)]],
                           freqs, axs[int(i),6:9])
plot_comparaison_predict(predict_kasios)
# Stack the species training features and the 15 mean Kasios vectors so all
# points are embedded in the same 2-D space.
data_with_kasios = np.concatenate((X,dataset_kasios.groupby(['0']).mean()),axis=0)
from sklearn import manifold
# t-SNE down to 2 dimensions (slow -- the result is cached to disk below)
tsne = manifold.TSNE(n_components=2, init='random',
                     random_state=0, perplexity=100)
Y = tsne.fit_transform(data_with_kasios)
np.save('resultTSNE.npy', Y)
def plot_red_dim(points, title, labels, show=-1):
    """Scatter a 2-D embedding of all bird records plus the Kasios records.

    points -- (n, 2) embedding; the last 15 rows are the Kasios records.
    title  -- figure title.
    labels -- species label for each non-Kasios point.
    show   -- -1: colour every species differently and colour each Kasios
              circle by its predicted species; otherwise a species index:
              suspected Blue Pipit records are blue, the rest orange.
    Uses the module-level predict_kasios, list_kasios_bp and cat_bird.
    """
    if show == -1:
        colors = ['#a6cee3','#009432','#b2df8a','#33a02c','#fb9a99','#e31a1c',
                  '#e84118','#b15928','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a',
                  '#004d40','#7B1FA2','#7C4DFF','#795548','#0652DD','#B53471',
                  '#FF9800','#8BC34A','#CDDC39','#b71c1c','#FFC107','#607D8B']
    else:
        # NOTE(review): hard-coded 19 here vs the 24 colours above --
        # presumably the number of species actually present; confirm.
        colors = ['#0652DD' if i==show else '#009432' for i in range(19)]
    # split the embedding: bird points first, the 15 Kasios points last
    a1 = points[:-15, 0]
    a2 = points[:-15, 1]
    df = pd.DataFrame(dict(a1=a1, a2=a2, label=labels))
    groups = df.groupby('label')
    points_kasios = points[-15:,:]
    fig, ax = plt.subplots(figsize=(15,15))
    #ax.scatter(points_all_birds[:, 0], points_all_birds[:, 1],marker='.' , color=colors, cmap='tab20', label=labels)
    for i, (name, group) in enumerate(groups):
        ax.scatter(group.a1, group.a2, marker='.', color=colors[i], label=cat_bird.categories.tolist()[name])
    ax.legend(bbox_to_anchor=(1.1, 1.05))
    # draw each Kasios record as a numbered circle
    for i in range(15):
        if show == -1:
            colork = colors[predict_kasios[i]]
        else:
            if i in list_kasios_bp:
                colork='#0652DD'
            else:
                colork = 'orange'
        ax.text(points_kasios[i,0], points_kasios[i, 1], str(i+1), color='black',
                ha="center", va="center",
                bbox={'pad':0.4, 'boxstyle':'circle',
                      'edgecolor':'none', 'facecolor':colork})
    plt.axis('off')
    plt.title(title, fontsize = 20)
    #plt.legend(loc=2, scatterpoints=1)
    plt.show()
# t-SNE view highlighting species index 16 (the hard-coded Blue Pipit
# column used earlier -- see the NOTE on list_kasios_bp)
plot_red_dim(np.load('resultTSNE.npy'), 't-SNE - Representation 2-dimensions', y, show=16)
import umap
# UMAP embedding of the same stacked data (third-party umap-learn package)
embedding = umap.UMAP(n_neighbors=25,
                      min_dist=0.3,
                      metric='correlation').fit_transform(data_with_kasios)
np.save('resultUMAP.npy', embedding)
plot_red_dim(np.load('resultUMAP.npy'), 'UMAP - Predicted species for the Kasios', y)
plot_red_dim(np.load('resultUMAP.npy'), 'UMAP - Supposed Blue pipits (in blue!) after the classification', y, show=16)
# final map: all records plus the suspected Blue Pipit Kasios records
vst.plotmap(data, kasios_records, map_contour, list_kasios_bp)